Univariate Time-Series Models (ARIMA/SARIMA)

This section will contain the initial models to better understand the various time series datasets included in this project. This involves determining how to properly augment the data, selecting model parameters, fitting models, diagnosing them, and obtaining forecasts to contextualize results.

Note: Much of the code and process on this page is re-purposed from:

Gamage, P. (2026). Applied Time Series for Data Science.

Code
# Attach packages once each; the original listed tidyverse and ggplot2 twice
# (a repeated library() call is a no-op, so dropping the duplicates does not
# change the search path or masking order).
library(tidyverse)   # also attaches ggplot2, dplyr, readr
library(ggplot2)
library(forecast)
library(astsa) 
library(xts)
library(tseries)
library(fpp2)
library(fma)
library(lubridate)
library(TSstudio)
library(quantmod)
library(tidyquant)
library(plotly)
library(dplyr)

# Daily closing-price data for the ride-share, tech, and energy tickers
uber <- read_csv("../data/uber.csv")
lyft <- read_csv("../data/lyft.csv")
zoom <- read_csv("../data/zoom.csv")
microsoft <- read_csv("../data/microsoft.csv")
cisco <- read_csv("../data/cisco.csv")
bp <- read_csv("../data/bp.csv")
shell <- read_csv("../data/shell.csv")
exxon <- read_csv("../data/exxon.csv")
chevron <- read_csv("../data/chevron.csv")
# Transit ridership and travel datasets
ridership <- read_csv("../data/ridership.csv")
mta <- read_csv("../data/mta.csv")
la <- read_csv("../data/la.csv")
cta <- read_csv("../data/cta.csv")
rideshare <- read_csv("../data/rideshare.csv")
telecom <- read_csv("../data/telecom.csv")
oil <- read_csv("../data/oil.csv")
# Gas prices: appends a day so the Month string parses as a full Date.
# Assumes Month is stored like "Apr-93" ("%b-%y") -- TODO confirm against
# gas_prices.csv; a different format would yield NA dates here.
gas <- read_csv("../data/gas_prices.csv")
gas$Month <- as.Date(paste0(gas$Month, "-01"), format = "%b-%y-%d")
# Restrict each monthly series to the common Apr 1993 - Jan 2024 window
gas <- gas[gas$Month >= as.Date("1993-04-01"),]
gas <- gas[gas$Month <= as.Date("2024-01-01"),]
unemp <- read_csv("../data/unemployment.csv")
unemp <- unemp[unemp$observation_date >= as.Date("1993-04-01"),]
unemp <- unemp[unemp$observation_date <= as.Date("2024-01-01"),]
# Monthly vehicle-miles traveled; observation_date arrives as m/d/y strings
cars <- read_csv("../data/monthly_miles.csv")
cars$observation_date <- as.Date(cars$observation_date, format = "%m/%d/%y")
cars <- cars[cars$observation_date >= as.Date("1993-04-01"),]
cars <- cars[cars$observation_date <= as.Date("2024-01-01"),]

Determining Stationarity

Prior to fitting any time series models to these datasets, we must determine whether they are stationary to decide if data augmentation needs to be applied. Some of these steps were executed in the Exploratory Data Analysis section, so this will be a summation of what drives our decisions.

Code
library(gridExtra)

Attaching package: 'gridExtra'
The following object is masked from 'package:dplyr':

    combine
Code
# Quarterly public-transit ridership series, beginning Q1 1992
ridership_ts <- ts(ridership$`Total Ridership (000s)`, start = c(1992, 1), frequency = 4)

# ACF/PACF pair with the report's shared color theme
acf_panel <- ggAcf(ridership_ts) +
  ggtitle("ACF Plot for Public Transit Ridership") +
  theme_bw() +
  geom_segment(lineend = "butt", color = "#68A2B9") +
  geom_hline(yintercept = 0, color = "#68A2B9")
pacf_panel <- ggPacf(ridership_ts) +
  ggtitle("PACF Plot for Public Transit Ridership") +
  theme_bw() +
  geom_segment(lineend = "butt", color = "#68A2B9") +
  geom_hline(yintercept = 0, color = "#68A2B9")
grid.arrange(acf_panel, pacf_panel, nrow = 2)

Code
tseries::adf.test(na.remove(ridership_ts))

    Augmented Dickey-Fuller Test

data:  na.remove(ridership_ts)
Dickey-Fuller = -1.8774, Lag order = 5, p-value = 0.6275
alternative hypothesis: stationary
Code
# ts() start accepts at most (year, offset); the original c(2020,3,1) silently
# dropped the third element and encoded day 3 of 2020 rather than March 1.
# March 1, 2020 is day 61 of that (leap) year.
mta_ts <- ts(mta$total_ridership, start = c(2020, 61), frequency = 365.25)
mta_acf <- ggAcf(mta_ts, lag.max = 30) + ggtitle("ACF Plot for MTA (New York City) Ridership") + theme_bw() +
  geom_segment(lineend = "butt", color = "#68A2B9") +
  geom_hline(yintercept = 0, color = "#68A2B9")
mta_pacf <- ggPacf(mta_ts, lag.max = 30) + ggtitle("PACF Plot for MTA (New York City) Ridership") + theme_bw() +
  geom_segment(lineend = "butt", color = "#68A2B9") +
  geom_hline(yintercept = 0, color = "#68A2B9")
grid.arrange(mta_acf, mta_pacf, nrow = 2)

Code
tseries::adf.test(na.remove(mta_ts))
Warning in tseries::adf.test(na.remove(mta_ts)): p-value smaller than printed
p-value

    Augmented Dickey-Fuller Test

data:  na.remove(mta_ts)
Dickey-Fuller = -6.554, Lag order = 12, p-value = 0.01
alternative hypothesis: stationary
Code
# ts() start accepts at most (year, offset); the third element of the original
# c(2015,1,1) was silently ignored, so c(2015, 1) is the equivalent, honest form.
cta_ts <- ts(cta$total_rides, start = c(2015, 1), frequency = 365.25)
cta_acf <- ggAcf(cta_ts, lag.max = 30) + ggtitle("ACF Plot for CTA (Chicago) Ridership") + theme_bw() +
  geom_segment(lineend = "butt", color = "#68A2B9") +
  geom_hline(yintercept = 0, color = "#68A2B9")
cta_pacf <- ggPacf(cta_ts, lag.max = 30) + ggtitle("PACF Plot for CTA (Chicago) Ridership") + theme_bw() +
  geom_segment(lineend = "butt", color = "#68A2B9") +
  geom_hline(yintercept = 0, color = "#68A2B9")
grid.arrange(cta_acf, cta_pacf, nrow = 2)

Code
tseries::adf.test(na.remove(cta_ts))

    Augmented Dickey-Fuller Test

data:  na.remove(cta_ts)
Dickey-Fuller = -3.8003, Lag order = 15, p-value = 0.01895
alternative hypothesis: stationary
Code
# Monthly vehicle-miles-traveled series (FRED M12MTVUSM227NFWA), Jan 1993 onward
cars_ts <- ts(cars$M12MTVUSM227NFWA, start = c(1993, 1), frequency = 12)

acf_panel <- ggAcf(cars_ts, lag.max = 30) +
  ggtitle("ACF Plot for Vehicle Miles") +
  theme_bw() +
  geom_segment(lineend = "butt", color = "#68A2B9") +
  geom_hline(yintercept = 0, color = "#68A2B9")
pacf_panel <- ggPacf(cars_ts, lag.max = 30) +
  ggtitle("PACF Plot for Vehicle Miles") +
  theme_bw() +
  geom_segment(lineend = "butt", color = "#68A2B9") +
  geom_hline(yintercept = 0, color = "#68A2B9")
grid.arrange(acf_panel, pacf_panel, nrow = 2)

Code
tseries::adf.test(na.remove(cars_ts))

    Augmented Dickey-Fuller Test

data:  na.remove(cars_ts)
Dickey-Fuller = -2.7693, Lag order = 7, p-value = 0.2523
alternative hypothesis: stationary
Code
# Monthly retail gas price series, Jan 1993 onward
gas_ts <- ts(gas$`Price per Gallon`, start = c(1993, 1), frequency = 12)

acf_panel <- ggAcf(gas_ts, lag.max = 30) +
  ggtitle("ACF Plot for Gas Prices") +
  theme_bw() +
  geom_segment(lineend = "butt", color = "#68A2B9") +
  geom_hline(yintercept = 0, color = "#68A2B9")
pacf_panel <- ggPacf(gas_ts, lag.max = 30) +
  ggtitle("PACF Plot for Gas Prices") +
  theme_bw() +
  geom_segment(lineend = "butt", color = "#68A2B9") +
  geom_hline(yintercept = 0, color = "#68A2B9")
grid.arrange(acf_panel, pacf_panel, nrow = 2)

Code
tseries::adf.test(na.remove(gas_ts))

    Augmented Dickey-Fuller Test

data:  na.remove(gas_ts)
Dickey-Fuller = -2.484, Lag order = 7, p-value = 0.3727
alternative hypothesis: stationary
Code
# Monthly U.S. unemployment rate (UNRATE), Jan 1993 onward
unemp_ts <- ts(unemp$UNRATE, start = c(1993, 1), frequency = 12)

acf_panel <- ggAcf(unemp_ts, lag.max = 30) +
  ggtitle("ACF Plot for Unemployment Rate") +
  theme_bw() +
  geom_segment(lineend = "butt", color = "#68A2B9") +
  geom_hline(yintercept = 0, color = "#68A2B9")
pacf_panel <- ggPacf(unemp_ts, lag.max = 30) +
  ggtitle("PACF Plot for Unemployment Rate") +
  theme_bw() +
  geom_segment(lineend = "butt", color = "#68A2B9") +
  geom_hline(yintercept = 0, color = "#68A2B9")
grid.arrange(acf_panel, pacf_panel, nrow = 2)

Code
tseries::adf.test(na.remove(unemp_ts))

    Augmented Dickey-Fuller Test

data:  na.remove(unemp_ts)
Dickey-Fuller = -2.3338, Lag order = 7, p-value = 0.4361
alternative hypothesis: stationary

Uber

Code
# ts() start accepts at most (year, offset); the third element of the original
# c(2020,1,1) was silently ignored, so c(2020, 1) is equivalent.
uber_ts <- ts(uber$UBER.Close, start = c(2020, 1), frequency = 252)
uber_acf <- ggAcf(uber_ts, lag.max = 30) + ggtitle("ACF Plot for Uber Stock Price") + theme_bw() +
  geom_segment(lineend = "butt", color = "#68A2B9") +
  geom_hline(yintercept = 0, color = "#68A2B9")
# lag.max = 30 added for consistency with the ACF panel above and with the
# sibling Zoom/Exxon chunks, which truncate both plots at 30 lags.
uber_pacf <- ggPacf(uber_ts, lag.max = 30) + ggtitle("PACF Plot for Uber Stock Price") + theme_bw() +
  geom_segment(lineend = "butt", color = "#68A2B9") +
  geom_hline(yintercept = 0, color = "#68A2B9")
grid.arrange(uber_acf, uber_pacf, nrow = 2)

Code
tseries::adf.test(na.remove(uber_ts))

    Augmented Dickey-Fuller Test

data:  na.remove(uber_ts)
Dickey-Fuller = -1.5743, Lag order = 11, p-value = 0.7585
alternative hypothesis: stationary

Zoom

Code
# ts() start accepts at most (year, offset); the third element of the original
# c(2020,1,1) was silently ignored, so c(2020, 1) is equivalent.
zoom_ts <- ts(zoom$ZM.Close, start = c(2020, 1), frequency = 252)
zoom_acf <- ggAcf(zoom_ts, lag.max = 30) + ggtitle("ACF Plot for Zoom Stock Price") + theme_bw() +
  geom_segment(lineend = "butt", color = "#68A2B9") +
  geom_hline(yintercept = 0, color = "#68A2B9")
zoom_pacf <- ggPacf(zoom_ts, lag.max = 30) + ggtitle("PACF Plot for Zoom Stock Price") + theme_bw() +
  geom_segment(lineend = "butt", color = "#68A2B9") +
  geom_hline(yintercept = 0, color = "#68A2B9")
grid.arrange(zoom_acf, zoom_pacf, nrow = 2)

Code
tseries::adf.test(na.remove(zoom_ts))

    Augmented Dickey-Fuller Test

data:  na.remove(zoom_ts)
Dickey-Fuller = -2.5477, Lag order = 11, p-value = 0.3465
alternative hypothesis: stationary

Exxon

Code
# ts() start accepts at most (year, offset); the third element of the original
# c(2020,1,1) was silently ignored, so c(2020, 1) is equivalent.
exxon_ts <- ts(exxon$XOM.Close, start = c(2020, 1), frequency = 252)
exxon_acf <- ggAcf(exxon_ts, lag.max = 30) + ggtitle("ACF Plot for Exxon Stock Price") + theme_bw() +
  geom_segment(lineend = "butt", color = "#68A2B9") +
  geom_hline(yintercept = 0, color = "#68A2B9")
exxon_pacf <- ggPacf(exxon_ts, lag.max = 30) + ggtitle("PACF Plot for Exxon Stock Price") + theme_bw() +
  geom_segment(lineend = "butt", color = "#68A2B9") +
  geom_hline(yintercept = 0, color = "#68A2B9")
grid.arrange(exxon_acf, exxon_pacf, nrow = 2)

Code
tseries::adf.test(na.remove(exxon_ts))

    Augmented Dickey-Fuller Test

data:  na.remove(exxon_ts)
Dickey-Fuller = -2.5443, Lag order = 11, p-value = 0.3479
alternative hypothesis: stationary

In all of these cases, the time series data are not stationary. This is evident due to the large values in the ACF plots, despite some ADF tests suggesting otherwise. Thus, we are likely going to need to difference the data in all cases. The following section compares differenced data to the original.

Differencing

Code
fit_ridership <- lm(na.remove(ridership_ts)~time(na.remove(ridership_ts)), na.action=NULL)
# `main=` was being forwarded into ggAcf's internal geom_segment and silently
# dropped (see the "Ignoring unknown parameters" warnings it produced), so the
# panels rendered untitled. ggtitle() sets the titles correctly.
plot1 <- ggAcf(ridership_ts, 48) + ggtitle("Original Data") + theme_bw() +
  geom_segment(lineend = "butt", color = "#68A2B9") +
  geom_hline(yintercept = 0, color = "#68A2B9")
plot2 <- ggAcf(resid(fit_ridership), 48) + ggtitle("Detrended Data") + theme_bw() +
  geom_segment(lineend = "butt", color = "#68A2B9") +
  geom_hline(yintercept = 0, color = "#68A2B9")
plot3 <- ggAcf(diff(ridership_ts), 48) + ggtitle("First Differenced Data") + theme_bw() +
  geom_segment(lineend = "butt", color = "#68A2B9") +
  geom_hline(yintercept = 0, color = "#68A2B9")
plot4 <- ggAcf(diff(diff(ridership_ts)), 48) + ggtitle("Second Differenced Data") + theme_bw() +
  geom_segment(lineend = "butt", color = "#68A2B9") +
  geom_hline(yintercept = 0, color = "#68A2B9")
grid.arrange(plot1, plot2, plot3, plot4, nrow=4)

Code
fit_mta <- lm(na.remove(mta_ts)~time(na.remove(mta_ts)), na.action=NULL)
# `main=` was silently dropped by ggAcf's internal geoms; use ggtitle() so the
# panel titles actually render.
plot1 <- ggAcf(mta_ts, 48) + ggtitle("Original Data") + theme_bw() +
  geom_segment(lineend = "butt", color = "#68A2B9") +
  geom_hline(yintercept = 0, color = "#68A2B9")
plot2 <- ggAcf(resid(fit_mta), 48) + ggtitle("Detrended Data") + theme_bw() +
  geom_segment(lineend = "butt", color = "#68A2B9") +
  geom_hline(yintercept = 0, color = "#68A2B9")
plot3 <- ggAcf(diff(mta_ts), 48) + ggtitle("First Differenced Data") + theme_bw() +
  geom_segment(lineend = "butt", color = "#68A2B9") +
  geom_hline(yintercept = 0, color = "#68A2B9")
plot4 <- ggAcf(diff(diff(mta_ts)), 48) + ggtitle("Second Differenced Data") + theme_bw() +
  geom_segment(lineend = "butt", color = "#68A2B9") +
  geom_hline(yintercept = 0, color = "#68A2B9")
grid.arrange(plot1, plot2, plot3, plot4, nrow=4)

Code
fit_cta <- lm(na.remove(cta_ts)~time(na.remove(cta_ts)), na.action=NULL)
# `main=` was silently dropped by ggAcf's internal geoms; use ggtitle() so the
# panel titles actually render.
plot1 <- ggAcf(cta_ts, 48) + ggtitle("Original Data") + theme_bw() +
  geom_segment(lineend = "butt", color = "#68A2B9") +
  geom_hline(yintercept = 0, color = "#68A2B9")
plot2 <- ggAcf(resid(fit_cta), 48) + ggtitle("Detrended Data") + theme_bw() +
  geom_segment(lineend = "butt", color = "#68A2B9") +
  geom_hline(yintercept = 0, color = "#68A2B9")
plot3 <- ggAcf(diff(cta_ts), 48) + ggtitle("First Differenced Data") + theme_bw() +
  geom_segment(lineend = "butt", color = "#68A2B9") +
  geom_hline(yintercept = 0, color = "#68A2B9")
plot4 <- ggAcf(diff(diff(cta_ts)), 48) + ggtitle("Second Differenced Data") + theme_bw() +
  geom_segment(lineend = "butt", color = "#68A2B9") +
  geom_hline(yintercept = 0, color = "#68A2B9")
grid.arrange(plot1, plot2, plot3, plot4, nrow=4)

Code
fit_cars <- lm(na.remove(cars_ts)~time(na.remove(cars_ts)), na.action=NULL)
# `main=` was silently dropped by ggAcf's internal geoms; use ggtitle() so the
# panel titles actually render.
plot1 <- ggAcf(cars_ts, 48) + ggtitle("Original Data") + theme_bw() +
  geom_segment(lineend = "butt", color = "#68A2B9") +
  geom_hline(yintercept = 0, color = "#68A2B9")
plot2 <- ggAcf(resid(fit_cars), 48) + ggtitle("Detrended Data") + theme_bw() +
  geom_segment(lineend = "butt", color = "#68A2B9") +
  geom_hline(yintercept = 0, color = "#68A2B9")
plot3 <- ggAcf(diff(cars_ts), 48) + ggtitle("First Differenced Data") + theme_bw() +
  geom_segment(lineend = "butt", color = "#68A2B9") +
  geom_hline(yintercept = 0, color = "#68A2B9")
plot4 <- ggAcf(diff(diff(cars_ts)), 48) + ggtitle("Second Differenced Data") + theme_bw() +
  geom_segment(lineend = "butt", color = "#68A2B9") +
  geom_hline(yintercept = 0, color = "#68A2B9")
grid.arrange(plot1, plot2, plot3, plot4, nrow=4)

Code
fit_gas <- lm(na.remove(gas_ts)~time(na.remove(gas_ts)), na.action=NULL)
# `main=` was silently dropped by ggAcf's internal geoms; use ggtitle() so the
# panel titles actually render.
plot1 <- ggAcf(gas_ts, 48) + ggtitle("Original Data") + theme_bw() +
  geom_segment(lineend = "butt", color = "#68A2B9") +
  geom_hline(yintercept = 0, color = "#68A2B9")
plot2 <- ggAcf(resid(fit_gas), 48) + ggtitle("Detrended Data") + theme_bw() +
  geom_segment(lineend = "butt", color = "#68A2B9") +
  geom_hline(yintercept = 0, color = "#68A2B9")
plot3 <- ggAcf(diff(gas_ts), 48) + ggtitle("First Differenced Data") + theme_bw() +
  geom_segment(lineend = "butt", color = "#68A2B9") +
  geom_hline(yintercept = 0, color = "#68A2B9")
plot4 <- ggAcf(diff(diff(gas_ts)), 48) + ggtitle("Second Differenced Data") + theme_bw() +
  geom_segment(lineend = "butt", color = "#68A2B9") +
  geom_hline(yintercept = 0, color = "#68A2B9")
grid.arrange(plot1, plot2, plot3, plot4, nrow=4)

Code
fit_unemp <- lm(na.remove(unemp_ts)~time(na.remove(unemp_ts)), na.action=NULL)
# `main=` was silently dropped by ggAcf's internal geoms; use ggtitle() so the
# panel titles actually render.
plot1 <- ggAcf(unemp_ts, 48) + ggtitle("Original Data") + theme_bw() +
  geom_segment(lineend = "butt", color = "#68A2B9") +
  geom_hline(yintercept = 0, color = "#68A2B9")
plot2 <- ggAcf(resid(fit_unemp), 48) + ggtitle("Detrended Data") + theme_bw() +
  geom_segment(lineend = "butt", color = "#68A2B9") +
  geom_hline(yintercept = 0, color = "#68A2B9")
plot3 <- ggAcf(diff(unemp_ts), 48) + ggtitle("First Differenced Data") + theme_bw() +
  geom_segment(lineend = "butt", color = "#68A2B9") +
  geom_hline(yintercept = 0, color = "#68A2B9")
plot4 <- ggAcf(diff(diff(unemp_ts)), 48) + ggtitle("Second Differenced Data") + theme_bw() +
  geom_segment(lineend = "butt", color = "#68A2B9") +
  geom_hline(yintercept = 0, color = "#68A2B9")
grid.arrange(plot1, plot2, plot3, plot4, nrow=4)

Uber

Code
fit_uber <- lm(na.remove(uber_ts)~time(na.remove(uber_ts)), na.action=NULL)
# `main=` was silently dropped by ggAcf's internal geoms; use ggtitle() so the
# panel titles actually render.
plot1 <- ggAcf(uber_ts, 48) + ggtitle("Original Data") + theme_bw() +
  geom_segment(lineend = "butt", color = "#68A2B9") +
  geom_hline(yintercept = 0, color = "#68A2B9")
plot2 <- ggAcf(resid(fit_uber), 48) + ggtitle("Detrended Data") + theme_bw() +
  geom_segment(lineend = "butt", color = "#68A2B9") +
  geom_hline(yintercept = 0, color = "#68A2B9")
plot3 <- ggAcf(diff(uber_ts), 48) + ggtitle("First Differenced Data") + theme_bw() +
  geom_segment(lineend = "butt", color = "#68A2B9") +
  geom_hline(yintercept = 0, color = "#68A2B9")
plot4 <- ggAcf(diff(diff(uber_ts)), 48) + ggtitle("Second Differenced Data") + theme_bw() +
  geom_segment(lineend = "butt", color = "#68A2B9") +
  geom_hline(yintercept = 0, color = "#68A2B9")
grid.arrange(plot1, plot2, plot3, plot4, nrow=4)

Zoom

Code
fit_zoom <- lm(na.remove(zoom_ts)~time(na.remove(zoom_ts)), na.action=NULL)
# `main=` was silently dropped by ggAcf's internal geoms; use ggtitle() so the
# panel titles actually render.
plot1 <- ggAcf(zoom_ts, 48) + ggtitle("Original Data") + theme_bw() +
  geom_segment(lineend = "butt", color = "#68A2B9") +
  geom_hline(yintercept = 0, color = "#68A2B9")
plot2 <- ggAcf(resid(fit_zoom), 48) + ggtitle("Detrended Data") + theme_bw() +
  geom_segment(lineend = "butt", color = "#68A2B9") +
  geom_hline(yintercept = 0, color = "#68A2B9")
plot3 <- ggAcf(diff(zoom_ts), 48) + ggtitle("First Differenced Data") + theme_bw() +
  geom_segment(lineend = "butt", color = "#68A2B9") +
  geom_hline(yintercept = 0, color = "#68A2B9")
plot4 <- ggAcf(diff(diff(zoom_ts)), 48) + ggtitle("Second Differenced Data") + theme_bw() +
  geom_segment(lineend = "butt", color = "#68A2B9") +
  geom_hline(yintercept = 0, color = "#68A2B9")
grid.arrange(plot1, plot2, plot3, plot4, nrow=4)

Exxon

Code
fit_exxon <- lm(na.remove(exxon_ts)~time(na.remove(exxon_ts)), na.action=NULL)
# `main=` was silently dropped by ggAcf's internal geoms; use ggtitle() so the
# panel titles actually render.
plot1 <- ggAcf(exxon_ts, 48) + ggtitle("Original Data") + theme_bw() +
  geom_segment(lineend = "butt", color = "#68A2B9") +
  geom_hline(yintercept = 0, color = "#68A2B9")
plot2 <- ggAcf(resid(fit_exxon), 48) + ggtitle("Detrended Data") + theme_bw() +
  geom_segment(lineend = "butt", color = "#68A2B9") +
  geom_hline(yintercept = 0, color = "#68A2B9")
plot3 <- ggAcf(diff(exxon_ts), 48) + ggtitle("First Differenced Data") + theme_bw() +
  geom_segment(lineend = "butt", color = "#68A2B9") +
  geom_hline(yintercept = 0, color = "#68A2B9")
plot4 <- ggAcf(diff(diff(exxon_ts)), 48) + ggtitle("Second Differenced Data") + theme_bw() +
  geom_segment(lineend = "butt", color = "#68A2B9") +
  geom_hline(yintercept = 0, color = "#68A2B9")
grid.arrange(plot1, plot2, plot3, plot4, nrow=4)

In each of these cases, it is clear that first-order differencing is both necessary and sufficient. To avoid over-differencing the data, we will choose \(d=1\) for all time series data. The next section will address other parameters.

Deciding Parameters

Based on the ACF and PACF plots of the original, detrended, and differenced data at multiple orders of difference, below we will outline chosen parameters \(p\), \(d\), and \(q\) for each time series.

Code
# `main=` is silently dropped by ggAcf/ggPacf's internal geoms; set the titles
# with ggtitle() so they actually render.
plot1 <- ggAcf(diff(ridership_ts), 48) + ggtitle("First Differenced Data - ACF") + theme_bw() +
  geom_segment(lineend = "butt", color = "#68A2B9") +
  geom_hline(yintercept = 0, color = "#68A2B9")
plot2 <- ggPacf(diff(ridership_ts), 48) + ggtitle("First Differenced Data - PACF") + theme_bw() +
  geom_segment(lineend = "butt", color = "#68A2B9") +
  geom_hline(yintercept = 0, color = "#68A2B9")
grid.arrange(plot1, plot2, nrow=2)

Based on the low ACF values, it seems like an ARIMA(0,1,0) model would suffice here.

Code
# `main=` is silently dropped by ggAcf/ggPacf's internal geoms; set the titles
# with ggtitle() so they actually render.
plot1 <- ggAcf(diff(mta_ts), 48) + ggtitle("First Differenced Data - ACF") + theme_bw() +
  geom_segment(lineend = "butt", color = "#68A2B9") +
  geom_hline(yintercept = 0, color = "#68A2B9")
plot2 <- ggPacf(diff(mta_ts), 48) + ggtitle("First Differenced Data - PACF") + theme_bw() +
  geom_segment(lineend = "butt", color = "#68A2B9") +
  geom_hline(yintercept = 0, color = "#68A2B9")
grid.arrange(plot1, plot2, nrow=2)

These plots indicate that an ARIMA(7,1,2) model would be best.

Code
# `main=` is silently dropped by ggAcf/ggPacf's internal geoms; set the titles
# with ggtitle() so they actually render.
plot1 <- ggAcf(diff(cta_ts), 48) + ggtitle("First Differenced Data - ACF") + theme_bw() +
  geom_segment(lineend = "butt", color = "#68A2B9") +
  geom_hline(yintercept = 0, color = "#68A2B9")
plot2 <- ggPacf(diff(cta_ts), 48) + ggtitle("First Differenced Data - PACF") + theme_bw() +
  geom_segment(lineend = "butt", color = "#68A2B9") +
  geom_hline(yintercept = 0, color = "#68A2B9")
grid.arrange(plot1, plot2, nrow=2)

These plots indicate that an ARIMA(7,1,2) model would be best.

Code
# `main=` is silently dropped by ggAcf/ggPacf's internal geoms; set the titles
# with ggtitle() so they actually render.
plot1 <- ggAcf(diff(cars_ts), 48) + ggtitle("First Differenced Data - ACF") + theme_bw() +
  geom_segment(lineend = "butt", color = "#68A2B9") +
  geom_hline(yintercept = 0, color = "#68A2B9")
plot2 <- ggPacf(diff(cars_ts), 48) + ggtitle("First Differenced Data - PACF") + theme_bw() +
  geom_segment(lineend = "butt", color = "#68A2B9") +
  geom_hline(yintercept = 0, color = "#68A2B9")
grid.arrange(plot1, plot2, nrow=2)

Due to the clear seasonal component, these plots indicate that a SARIMA model with seasonal period equal to 12 would be best. The other parameters are difficult to capture here, so we will use auto.arima() for that.

Code
# `main=` is silently dropped by ggAcf/ggPacf's internal geoms; set the titles
# with ggtitle() so they actually render.
plot1 <- ggAcf(diff(gas_ts), 48) + ggtitle("First Differenced Data - ACF") + theme_bw() +
  geom_segment(lineend = "butt", color = "#68A2B9") +
  geom_hline(yintercept = 0, color = "#68A2B9")
plot2 <- ggPacf(diff(gas_ts), 48) + ggtitle("First Differenced Data - PACF") + theme_bw() +
  geom_segment(lineend = "butt", color = "#68A2B9") +
  geom_hline(yintercept = 0, color = "#68A2B9")
grid.arrange(plot1, plot2, nrow=2)

Due to the clear seasonal component, these plots indicate that a SARIMA model with seasonal period equal to 12 would be best. The other parameters are difficult to capture here, so we will use auto.arima() for that.

Code
# `main=` is silently dropped by ggAcf/ggPacf's internal geoms; set the titles
# with ggtitle() so they actually render.
plot1 <- ggAcf(diff(unemp_ts), 48) + ggtitle("First Differenced Data - ACF") + theme_bw() +
  geom_segment(lineend = "butt", color = "#68A2B9") +
  geom_hline(yintercept = 0, color = "#68A2B9")
plot2 <- ggPacf(diff(unemp_ts), 48) + ggtitle("First Differenced Data - PACF") + theme_bw() +
  geom_segment(lineend = "butt", color = "#68A2B9") +
  geom_hline(yintercept = 0, color = "#68A2B9")
grid.arrange(plot1, plot2, nrow=2)

These plots indicate that an ARIMA(2,1,2) model would be best.

Uber

Code
# `main=` is silently dropped by ggAcf/ggPacf's internal geoms; set the titles
# with ggtitle() so they actually render.
plot1 <- ggAcf(diff(uber_ts), 48) + ggtitle("First Differenced Data - ACF") + theme_bw() +
  geom_segment(lineend = "butt", color = "#68A2B9") +
  geom_hline(yintercept = 0, color = "#68A2B9")
plot2 <- ggPacf(diff(uber_ts), 48) + ggtitle("First Differenced Data - PACF") + theme_bw() +
  geom_segment(lineend = "butt", color = "#68A2B9") +
  geom_hline(yintercept = 0, color = "#68A2B9")
grid.arrange(plot1, plot2, nrow=2)

Zoom

Code
# `main=` is silently dropped by ggAcf/ggPacf's internal geoms; set the titles
# with ggtitle() so they actually render.
plot1 <- ggAcf(diff(zoom_ts), 48) + ggtitle("First Differenced Data - ACF") + theme_bw() +
  geom_segment(lineend = "butt", color = "#68A2B9") +
  geom_hline(yintercept = 0, color = "#68A2B9")
plot2 <- ggPacf(diff(zoom_ts), 48) + ggtitle("First Differenced Data - PACF") + theme_bw() +
  geom_segment(lineend = "butt", color = "#68A2B9") +
  geom_hline(yintercept = 0, color = "#68A2B9")
grid.arrange(plot1, plot2, nrow=2)

Exxon

Code
# `main=` is silently dropped by ggAcf/ggPacf's internal geoms; set the titles
# with ggtitle() so they actually render.
plot1 <- ggAcf(diff(exxon_ts), 48) + ggtitle("First Differenced Data - ACF") + theme_bw() +
  geom_segment(lineend = "butt", color = "#68A2B9") +
  geom_hline(yintercept = 0, color = "#68A2B9")
plot2 <- ggPacf(diff(exxon_ts), 48) + ggtitle("First Differenced Data - PACF") + theme_bw() +
  geom_segment(lineend = "butt", color = "#68A2B9") +
  geom_hline(yintercept = 0, color = "#68A2B9")
grid.arrange(plot1, plot2, nrow=2)

For each financial time series, it appears that ARIMA(0,1,0) is the most appropriate model.

Using auto.arima()

While it can be useful to select model parameters based on exploratory data analysis from ACF and PACF plots, there is value in finding the best model via computational methods. In this section, we will apply the auto.arima() function to each time series to get a second option (or validate our first option) for fitting a model.

Code
auto.arima(ridership_ts)
Series: ridership_ts 
ARIMA(0,1,0) 

sigma^2 = 2.626e+10:  log likelihood = -1717.06
AIC=3436.12   AICc=3436.15   BIC=3438.97
Code
auto.arima(mta_ts, seasonal = 7, max.p = 7, max.q = 7)
Series: mta_ts 
ARIMA(6,1,2) 

Coefficients:
          ar1      ar2      ar3      ar4      ar5     ar6     ma1     ma2
      -0.7494  -0.9447  -0.7991  -0.8257  -0.8687  -0.709  0.2711  0.3427
s.e.   0.0255   0.0159   0.0206   0.0190   0.0144   0.021  0.0299  0.0350

sigma^2 = 2.654e+11:  log likelihood = -25863.7
AIC=51745.41   AICc=51745.51   BIC=51794.74
Code
auto.arima(cta_ts, seasonal = 7, max.p = 7, max.q = 7)
Series: cta_ts 
ARIMA(7,1,1) 

Coefficients:
          ar1      ar2      ar3      ar4      ar5      ar6     ar7      ma1
      -0.3012  -0.4428  -0.3561  -0.3701  -0.4130  -0.2847  0.4562  -0.2031
s.e.   0.0388   0.0286   0.0336   0.0308   0.0306   0.0332  0.0271   0.0427

sigma^2 = 2.065e+10:  log likelihood = -48138.77
AIC=96295.54   AICc=96295.59   BIC=96351.29
Code
auto.arima(cars_ts)
Series: cars_ts 
ARIMA(3,1,1)(0,0,1)[12] with drift 

Coefficients:
         ar1      ar2     ar3      ma1     sma1      drift
      1.5764  -0.8919  0.2838  -0.7001  -0.7598  2517.5353
s.e.  0.1144   0.1089  0.0527   0.1163   0.0454   770.5523

sigma^2 = 38298848:  log likelihood = -3747.36
AIC=7508.72   AICc=7509.03   BIC=7536.1
Code
auto.arima(gas_ts)
Series: gas_ts 
ARIMA(3,1,1)(2,0,0)[12] 

Coefficients:
         ar1      ar2     ar3      ma1    sar1   sar2
      1.4089  -0.6509  0.1472  -0.9324  0.0487  0.012
s.e.  0.0929   0.0959  0.0675   0.0776  0.0560  0.065

sigma^2 = 0.01877:  log likelihood = 212.69
AIC=-411.38   AICc=-411.07   BIC=-384.01
Code
auto.arima(unemp_ts)
Series: unemp_ts 
ARIMA(0,1,0) 

sigma^2 = 0.3531:  log likelihood = -331.53
AIC=665.06   AICc=665.07   BIC=668.97

Uber

Code
auto.arima(uber_ts)
Series: uber_ts 
ARIMA(0,1,0) 

sigma^2 = 2.002:  log likelihood = -2368.29
AIC=4738.58   AICc=4738.59   BIC=4743.78

Zoom

Code
auto.arima(zoom_ts)
Series: zoom_ts 
ARIMA(0,1,0) 

sigma^2 = 75.76:  log likelihood = -4804.43
AIC=9610.86   AICc=9610.86   BIC=9616.06

Exxon

Code
auto.arima(exxon_ts)
Series: exxon_ts 
ARIMA(0,1,0) 

sigma^2 = 2.533:  log likelihood = -2525.85
AIC=5053.7   AICc=5053.71   BIC=5058.91

ARIMA/SARIMA Model Fitting

In this section we will simply fit a range of models using the parameters selected above (via either the parameters selected via plotting or the auto.arima() method) to determine the range of each parameter. These will be fit using the Arima() function in R and include all necessary components including order and seasonality. Thus, time series that don’t require seasonal analysis will be simple ARIMA models, while those that do will be SARIMA models. The best model will be chosen based on a combination of AIC and BIC scores.

Code
ridership_ts_scaled <- na.remove(ridership_ts) / 100000
Code
# SARIMA grid search over (p,d,q)(P,D,Q) with quarterly period 4.
p_values <- 0:2
d_values <- 0:1
q_values <- 0:2
P_values <- 0:2
D_values <- 0:1
Q_values <- 0:2
# Accumulate one row per candidate in a list and bind once at the end;
# growing vectors with append() inside a 6-deep loop is quadratic.
rows <- list()
for (p in p_values) {
  for (d in d_values) {
    for (q in q_values) {
      for (P in P_values) {
        for (D in D_values) {
          for (Q in Q_values) {
            # Some parameter combinations fail to converge under ML;
            # skip those fits instead of aborting the whole search.
            model <- tryCatch(
              Arima(ridership_ts_scaled, order = c(p, d, q),
                    seasonal = list(order = c(P, D, Q), period = 4),
                    method = "ML"),
              error = function(e) NULL
            )
            if (!is.null(model)) {
              rows[[length(rows) + 1]] <- data.frame(
                p_results = p, d_results = d, q_results = q,
                P_results = P, D_results = D, Q_results = Q,
                AIC_values = model$aic, BIC_values = model$bic,
                AICc_values = model$aicc
              )
            }
          }
        }
      }
    }
  }
}
results <- do.call(rbind, rows)
# Sort by the data frame's own column; ordering by a free-standing vector
# only works while the two happen to stay in sync.
results <- results[order(results$AIC_values), ]
head(results)
    p_results d_results q_results P_results D_results Q_results AIC_values
59          0         1         0         0         1         1   483.2010
113         1         0         0         0         1         1   483.2873
60          0         1         0         0         1         2   484.9264
65          0         1         0         1         1         1   484.9691
77          0         1         1         0         1         1   485.1170
167         1         1         0         0         1         1   485.1177
    BIC_values AICc_values
59    488.8416    483.3002
113   491.7723    483.4857
60    493.3872    485.1264
65    493.4300    485.1691
77    493.5778    485.3170
167   493.5785    485.3177
Code
(ridership_fit <- Arima(ridership_ts, order = c(0, 1, 0), seasonal = list(order=c(0,1,1), period=4)))
Series: ridership_ts 
ARIMA(0,1,0)(0,1,1)[4] 

Coefficients:
         sma1
      -0.9841
s.e.   0.2057

sigma^2 = 2.554e+10:  log likelihood = -1667.2
AIC=3338.41   AICc=3338.51   BIC=3344.05
Code
mta_ts_scaled <- na.remove(mta_ts) / 1000000
Code
# Non-seasonal ARIMA grid search for the daily MTA series (seasonal terms
# omitted: a period of 365.25 makes seasonal ML fits impractical here).
p_values <- 0:7
d_values <- 0:1
q_values <- 0:2
# Accumulate one row per candidate in a list and bind once at the end;
# growing vectors with append() inside nested loops is quadratic.
rows <- list()
for (p in p_values) {
  for (d in d_values) {
    for (q in q_values) {
      # Some parameter combinations fail to converge under ML;
      # skip those fits instead of aborting the whole search.
      model <- tryCatch(
        Arima(mta_ts_scaled, order = c(p, d, q), method = "ML"),
        error = function(e) NULL
      )
      if (!is.null(model)) {
        rows[[length(rows) + 1]] <- data.frame(
          p_results = p, d_results = d, q_results = q,
          AIC_values = model$aic, BIC_values = model$bic,
          AICc_values = model$aicc
        )
      }
    }
  }
}
results <- do.call(rbind, rows)
# Sort by the data frame's own column; ordering by a free-standing vector
# only works while the two happen to stay in sync.
results <- results[order(results$AIC_values), ]
head(results)
   p_results d_results q_results AIC_values BIC_values AICc_values
47         7         1         1   2659.461   2708.795    2659.563
48         7         1         2   2659.800   2714.615    2659.924
42         6         1         2   2700.342   2749.676    2700.444
46         7         1         0   2703.137   2746.990    2703.219
45         7         0         2   2703.707   2764.010    2703.856
41         6         1         1   2795.454   2839.306    2795.535
Code
(mta_fit <- Arima(mta_ts, order = c(7, 1, 1)))
Series: mta_ts 
ARIMA(7,1,1) 

Coefficients:
          ar1      ar2      ar3      ar4      ar5      ar6     ar7      ma1
      -0.0921  -0.3056  -0.2065  -0.2118  -0.2830  -0.0926  0.5559  -0.3907
s.e.   0.0409   0.0282   0.0338   0.0315   0.0307   0.0343  0.0262   0.0462

sigma^2 = 2.594e+11:  log likelihood = -25843.26
AIC=51704.52   AICc=51704.63   BIC=51753.86
Code
cta_ts_scaled <- na.remove(cta_ts) / 1000000
Code
# Grid search over non-seasonal ARIMA(p,d,q) orders for the scaled CTA series,
# ranked by AIC. Seasonal terms are omitted for the same reason as the MTA
# search: concurrent weekly/yearly seasonality is poorly served by one
# seasonal component.
p_values <- 0:7
d_values <- 0:1
q_values <- 0:2

# Preallocate result vectors instead of growing them with append() inside the
# loop (append() copies the whole vector on every call).
n_models <- length(p_values) * length(d_values) * length(q_values)
AIC_values <- numeric(n_models)
BIC_values <- numeric(n_models)
AICc_values <- numeric(n_models)
p_results <- integer(n_models)
d_results <- integer(n_models)
q_results <- integer(n_models)

i <- 0
for (p in p_values) {
  for (d in d_values) {
    for (q in q_values) {
      i <- i + 1
      model <- Arima(cta_ts_scaled, order = c(p, d, q), method = "ML")
      AIC_values[i] <- model$aic
      BIC_values[i] <- model$bic
      AICc_values[i] <- model$aicc
      p_results[i] <- p
      d_results[i] <- d
      q_results[i] <- q
    }
  }
}

results <- data.frame(p_results, d_results, q_results, AIC_values, BIC_values, AICc_values)
# Sort by the data frame's own column (not the free-standing vector).
results <- results[order(results$AIC_values), ]
head(results)
   p_results d_results q_results AIC_values BIC_values AICc_values
47         7         1         1  -3756.390  -3700.640   -3756.340
48         7         1         2  -3755.414  -3693.469   -3755.353
46         7         1         0  -3738.845  -3689.289   -3738.806
45         7         0         2  -3732.256  -3664.113   -3732.183
42         6         1         2  -3727.977  -3672.227   -3727.927
44         7         0         1  -3625.723  -3563.775   -3625.662
Code
(cta_fit <- Arima(cta_ts, order = c(7, 1, 1)))
Series: cta_ts 
ARIMA(7,1,1) 

Coefficients:
          ar1      ar2      ar3      ar4      ar5      ar6     ar7      ma1
      -0.3012  -0.4428  -0.3561  -0.3701  -0.4130  -0.2847  0.4562  -0.2031
s.e.   0.0388   0.0286   0.0336   0.0308   0.0306   0.0332  0.0271   0.0427

sigma^2 = 2.065e+10:  log likelihood = -48138.77
AIC=96295.54   AICc=96295.59   BIC=96351.29
Code
(cars_fit <- Arima(cars_ts, order = c(3, 1, 1), seasonal = list(order = c(0, 0, 1), period = 12)))
Series: cars_ts 
ARIMA(3,1,1)(0,0,1)[12] 

Coefficients:
         ar1      ar2     ar3      ma1     sma1
      1.6468  -0.9397  0.2842  -0.7642  -0.7696
s.e.  0.0913   0.0978  0.0566   0.0905   0.0449

sigma^2 = 38655917:  log likelihood = -3749.75
AIC=7511.49   AICc=7511.73   BIC=7534.96
Code
(gas_fit <- Arima(gas_ts, order = c(3, 1, 1), seasonal = list(order = c(2, 0, 0), period = 12)))
Series: gas_ts 
ARIMA(3,1,1)(2,0,0)[12] 

Coefficients:
         ar1      ar2     ar3      ma1    sar1   sar2
      1.4089  -0.6509  0.1472  -0.9324  0.0487  0.012
s.e.  0.0929   0.0959  0.0675   0.0776  0.0560  0.065

sigma^2 = 0.01877:  log likelihood = 212.69
AIC=-411.38   AICc=-411.07   BIC=-384.01
Code
(unemp_fit <- Arima(unemp_ts, order = c(2, 1, 2)))
Series: unemp_ts 
ARIMA(2,1,2) 

Coefficients:
         ar1     ar2      ma1      ma2
      0.1103  0.4672  -0.1006  -0.6238
s.e.  0.1968  0.1932   0.1736   0.1705

sigma^2 = 0.3458:  log likelihood = -325.69
AIC=661.38   AICc=661.55   BIC=680.94

Uber

Code
(uber_fit <- Arima(uber_ts, order = c(0, 1, 0)))
Series: uber_ts 
ARIMA(0,1,0) 

sigma^2 = 2.002:  log likelihood = -2368.29
AIC=4738.58   AICc=4738.59   BIC=4743.78

Zoom

Code
(zoom_fit <- Arima(zoom_ts, order = c(0, 1, 0)))
Series: zoom_ts 
ARIMA(0,1,0) 

sigma^2 = 75.76:  log likelihood = -4804.43
AIC=9610.86   AICc=9610.86   BIC=9616.06

Exxon

Code
(exxon_fit <- Arima(exxon_ts, order = c(0, 1, 0)))
Series: exxon_ts 
ARIMA(0,1,0) 

sigma^2 = 2.533:  log likelihood = -2525.85
AIC=5053.7   AICc=5053.71   BIC=5058.91

Full Model Diagnostics

Below we can see model diagnostics such as AIC and BIC for each final univariate model. These will be useful when comparing to multivariate, financial, and deep learning models in order to evaluate which methods are best at matching time series data for this project.

Code
capture.output(ridership_fit)
 [1] "Series: ridership_ts "                         
 [2] "ARIMA(0,1,0)(0,1,1)[4] "                       
 [3] ""                                              
 [4] "Coefficients:"                                 
 [5] "         sma1"                                 
 [6] "      -0.9841"                                 
 [7] "s.e.   0.2057"                                 
 [8] ""                                              
 [9] "sigma^2 = 2.554e+10:  log likelihood = -1667.2"
[10] "AIC=3338.41   AICc=3338.51   BIC=3344.05"      
Code
capture.output(mta_fit)
 [1] "Series: mta_ts "                                                            
 [2] "ARIMA(7,1,1) "                                                              
 [3] ""                                                                           
 [4] "Coefficients:"                                                              
 [5] "          ar1      ar2      ar3      ar4      ar5      ar6     ar7      ma1"
 [6] "      -0.0921  -0.3056  -0.2065  -0.2118  -0.2830  -0.0926  0.5559  -0.3907"
 [7] "s.e.   0.0409   0.0282   0.0338   0.0315   0.0307   0.0343  0.0262   0.0462"
 [8] ""                                                                           
 [9] "sigma^2 = 2.594e+11:  log likelihood = -25843.26"                           
[10] "AIC=51704.52   AICc=51704.63   BIC=51753.86"                                
Code
capture.output(cta_fit)
 [1] "Series: cta_ts "                                                            
 [2] "ARIMA(7,1,1) "                                                              
 [3] ""                                                                           
 [4] "Coefficients:"                                                              
 [5] "          ar1      ar2      ar3      ar4      ar5      ar6     ar7      ma1"
 [6] "      -0.3012  -0.4428  -0.3561  -0.3701  -0.4130  -0.2847  0.4562  -0.2031"
 [7] "s.e.   0.0388   0.0286   0.0336   0.0308   0.0306   0.0332  0.0271   0.0427"
 [8] ""                                                                           
 [9] "sigma^2 = 2.065e+10:  log likelihood = -48138.77"                           
[10] "AIC=96295.54   AICc=96295.59   BIC=96351.29"                                
Code
capture.output(cars_fit)
 [1] "Series: cars_ts "                               
 [2] "ARIMA(3,1,1)(0,0,1)[12] "                       
 [3] ""                                               
 [4] "Coefficients:"                                  
 [5] "         ar1      ar2     ar3      ma1     sma1"
 [6] "      1.6468  -0.9397  0.2842  -0.7642  -0.7696"
 [7] "s.e.  0.0913   0.0978  0.0566   0.0905   0.0449"
 [8] ""                                               
 [9] "sigma^2 = 38655917:  log likelihood = -3749.75" 
[10] "AIC=7511.49   AICc=7511.73   BIC=7534.96"       
Code
capture.output(gas_fit)
 [1] "Series: gas_ts "                                      
 [2] "ARIMA(3,1,1)(2,0,0)[12] "                             
 [3] ""                                                     
 [4] "Coefficients:"                                        
 [5] "         ar1      ar2     ar3      ma1    sar1   sar2"
 [6] "      1.4089  -0.6509  0.1472  -0.9324  0.0487  0.012"
 [7] "s.e.  0.0929   0.0959  0.0675   0.0776  0.0560  0.065"
 [8] ""                                                     
 [9] "sigma^2 = 0.01877:  log likelihood = 212.69"          
[10] "AIC=-411.38   AICc=-411.07   BIC=-384.01"             
Code
capture.output(unemp_fit)
 [1] "Series: unemp_ts "                          
 [2] "ARIMA(2,1,2) "                              
 [3] ""                                           
 [4] "Coefficients:"                              
 [5] "         ar1     ar2      ma1      ma2"     
 [6] "      0.1103  0.4672  -0.1006  -0.6238"     
 [7] "s.e.  0.1968  0.1932   0.1736   0.1705"     
 [8] ""                                           
 [9] "sigma^2 = 0.3458:  log likelihood = -325.69"
[10] "AIC=661.38   AICc=661.55   BIC=680.94"      

Uber

Code
capture.output(uber_fit)
[1] "Series: uber_ts "                           
[2] "ARIMA(0,1,0) "                              
[3] ""                                           
[4] "sigma^2 = 2.002:  log likelihood = -2368.29"
[5] "AIC=4738.58   AICc=4738.59   BIC=4743.78"   

Zoom

Code
capture.output(zoom_fit)
[1] "Series: zoom_ts "                           
[2] "ARIMA(0,1,0) "                              
[3] ""                                           
[4] "sigma^2 = 75.76:  log likelihood = -4804.43"
[5] "AIC=9610.86   AICc=9610.86   BIC=9616.06"   

Exxon

Code
capture.output(exxon_fit)
[1] "Series: exxon_ts "                          
[2] "ARIMA(0,1,0) "                              
[3] ""                                           
[4] "sigma^2 = 2.533:  log likelihood = -2525.85"
[5] "AIC=5053.7   AICc=5053.71   BIC=5058.91"    

Forecasting

Now that our models have been fit and diagnostics have been recorded, we can get a better sense of what these models actually predict by forecasting several time steps ahead. This will better contextualize what the models actually learn, and provide valuable insight towards what we can expect in the future.

Code
# 20-quarter-ahead forecast from the ridership SARIMA fit. With no holdout
# supplied, accuracy() reports in-sample (training set) measures only.
ridership_pred <- forecast(ridership_fit, h = 20)
accuracy(ridership_pred)
                   ME     RMSE      MAE       MPE    MAPE      MASE       ACF1
Training set -4682.87 156042.4 54633.29 -1.286491 4.18834 0.4239048 -0.0271934
Code
# Plot the ridership forecast. Title corrected to match the fitted model,
# ARIMA(0,1,0)(0,1,1)[4] (it previously read "ARIMA(1,1,2)(1,0,1)[4]").
autoplot(ridership_pred) +
  labs(title = "ARIMA(0,1,0)(0,1,1)[4] Forecast for National Ridership",
       x = "Time",
       y = "Predicted Values") +
  theme_minimal()

This forecast indicates that ridership is expected to maintain its seasonal pattern while trending slightly down starting in the next time step. As we can see from the confidence bands, this is not a very assured prediction. It is very likely that the variance introduced in 2020, which we saw was not totally accounted for in the differencing step, is still distorting the model's estimates. This prediction is quite counter-intuitive given the gradual rise we’ve observed since the COVID-19 dip.

Code
# One-year (365-day) forecast from the fitted MTA model. Title corrected to
# match the fit, ARIMA(7,1,1) (it previously read "ARIMA(7,1,2)").
mta_pred <- forecast(mta_fit, h = 365)
autoplot(mta_pred) +
  labs(title = "ARIMA(7,1,1) Forecast for New York Ridership",
       x = "Time",
       y = "Predicted Values") +
  theme_minimal()

This prediction appears to indicate that ridership in New York will continue to gradually rise, while losing variance as time goes on. Once again, these are not very confident predictions. It is important to note that we observed both weekly and yearly seasonality in this data. ARIMA models notably struggle when dealing with concurrent seasonal periods, so this could be contributing to an inability to produce a coherent forecast.

Code
# One-year (365-day) forecast from the fitted CTA model. Title corrected to
# match the fit, ARIMA(7,1,1) (it previously read "ARIMA(7,1,2)").
cta_pred <- forecast(cta_fit, h = 365)
autoplot(cta_pred) +
  labs(title = "ARIMA(7,1,1) Forecast for Chicago Ridership",
       x = "Time",
       y = "Predicted Values") +
  theme_minimal()

This prediction appears to indicate that ridership in Chicago will gradually fall, while losing variance as time goes on. Once again, these are not very confident predictions. It is important to note that we observed both weekly and yearly seasonality in this data. ARIMA models notably struggle when dealing with concurrent seasonal periods, so this could be contributing to an inability to produce a coherent forecast.

Code
# NOTE(review): this chunk exactly duplicates the earlier ridership
# forecast/accuracy chunk; one of the two could be removed.
ridership_pred <- forecast(ridership_fit, h = 20)
accuracy(ridership_pred)
                   ME     RMSE      MAE       MPE    MAPE      MASE       ACF1
Training set -4682.87 156042.4 54633.29 -1.286491 4.18834 0.4239048 -0.0271934
Code
# Three-year (36-month) forecast of monthly vehicle miles from the SARIMA fit.
cars_pred <- forecast(cars_fit, h = 36)
autoplot(cars_pred) +
  ggtitle("ARIMA(3,1,1)(0,0,1)[12] Forecast for Vehicle Miles") +
  xlab("Time") +
  ylab("Predicted Values") +
  theme_minimal()

This forecast does not tell us much, as our model appears to have struggled to create a coherent prediction. Ultimately, the forecast predicts a rather constant value with very wide confidence bands.

Code
# Three-year (36-month) forecast of monthly gas prices from the SARIMA fit.
gas_pred <- forecast(gas_fit, h = 36)
autoplot(gas_pred) +
  ggtitle("ARIMA(3,1,1)(2,0,0)[12] Forecast for Gas Prices") +
  xlab("Time") +
  ylab("Predicted Values") +
  theme_minimal()

This forecast does not tell us much, as our model appears to have struggled to create a coherent prediction. Ultimately, the forecast predicts a rather constant value with very wide confidence bands.

Code
# Three-year (36-month) forecast of the unemployment rate from the ARIMA fit.
unemp_pred <- forecast(unemp_fit, h = 36)
autoplot(unemp_pred) +
  ggtitle("ARIMA(2,1,2) Forecast for Unemployment Rate") +
  xlab("Time") +
  ylab("Predicted Values") +
  theme_minimal()

This forecast does not tell us much, as our model appears to have struggled to create a coherent prediction. Ultimately, the forecast predicts a rather constant value with very wide confidence bands.

Uber

Code
# One trading year (252 days) ahead for Uber's stock price.
uber_pred <- forecast(uber_fit, h = 252)
autoplot(uber_pred) +
  ggtitle("ARIMA(0,1,0) Forecast for Uber Stock Price") +
  xlab("Time") +
  ylab("Predicted Values") +
  theme_minimal()

Zoom

Code
# One trading year (252 days) ahead for Zoom's stock price.
zoom_pred <- forecast(zoom_fit, h = 252)
autoplot(zoom_pred) +
  ggtitle("ARIMA(0,1,0) Forecast for Zoom Stock Price") +
  xlab("Time") +
  ylab("Predicted Values") +
  theme_minimal()

Exxon

Code
# One trading year (252 days) ahead for Exxon's stock price.
exxon_pred <- forecast(exxon_fit, h = 252)
autoplot(exxon_pred) +
  ggtitle("ARIMA(0,1,0) Forecast for Exxon Stock Price") +
  xlab("Time") +
  ylab("Predicted Values") +
  theme_minimal()

These forecasts do not tell us much, as our models appear to have struggled to create coherent predictions. Ultimately, the forecasts predict rather constant values with very wide confidence bands.

Comparison with Benchmark Methods

Finally, to truly understand the value of univariate time series analysis via fitting ARIMA/SARIMA models, it is important to compare with benchmark methods and see how they stack up. The following plots include the ARIMA forecasts alongside a mean forecast, a naïve forecast, and a drift forecast.

Code
# Compare the SARIMA forecast with mean, naive, and drift benchmarks for the
# national ridership series.
mean_ridership <- meanf(ridership_ts, h = 20)
naive_ridership <- naive(ridership_ts, h = 20)
drift_ridership <- rwf(ridership_ts, drift = TRUE, h = 20)
arima_ridership <- ridership_pred

# Turn a forecast's point predictions into a two-column plotting frame.
as_fc_df <- function(fc, label) {
  setNames(data.frame(time(fc$mean), as.numeric(fc$mean)), c("Date", label))
}
mean_ridership_df <- as_fc_df(mean_ridership, "Mean")
naive_ridership_df <- as_fc_df(naive_ridership, "Naive")
drift_ridership_df <- as_fc_df(drift_ridership, "Drift")
arima_ridership_df <- as_fc_df(arima_ridership, "ARIMA_Fit")

ridership_ts_df <- setNames(
  data.frame(time(ridership_ts), as.numeric(ridership_ts)),
  c("Date", "Ridership")
)

# Build the overlay plot one trace at a time.
fig <- plot_ly()
fig <- add_lines(fig, data = ridership_ts_df, x = ~Date, y = ~Ridership,
                 name = 'Original Data', line = list(color = 'black'))
fig <- add_lines(fig, data = mean_ridership_df, x = ~Date, y = ~Mean,
                 name = 'Mean Forecast', line = list(color = 'blue'))
fig <- add_lines(fig, data = naive_ridership_df, x = ~Date, y = ~Naive,
                 name = 'Naïve Forecast', line = list(color = 'red'))
fig <- add_lines(fig, data = drift_ridership_df, x = ~Date, y = ~Drift,
                 name = 'Drift Forecast', line = list(color = 'green'))
fig <- add_lines(fig, data = arima_ridership_df, x = ~Date, y = ~ARIMA_Fit,
                 name = 'ARIMA Fit', line = list(color = 'purple'))
layout(fig,
       title = 'Ridership Forecast',
       xaxis = list(title = 'Date'),
       yaxis = list(title = 'Number of Rides (000s)'),
       legend = list(title = list(text = 'Forecast Methods')))
Code
# Compare the ARIMA forecast with mean, naive, and drift benchmarks for the
# MTA (New York) ridership series.
mean_mta <- meanf(mta_ts, h = 365)
naive_mta <- naive(mta_ts, h = 365)
drift_mta <- rwf(mta_ts, drift = TRUE, h = 365)
arima_mta <- mta_pred

# Turn a forecast's point predictions into a two-column plotting frame.
as_fc_df <- function(fc, label) {
  setNames(data.frame(time(fc$mean), as.numeric(fc$mean)), c("Date", label))
}
mean_mta_df <- as_fc_df(mean_mta, "Mean")
naive_mta_df <- as_fc_df(naive_mta, "Naive")
drift_mta_df <- as_fc_df(drift_mta, "Drift")
arima_mta_df <- as_fc_df(arima_mta, "ARIMA_Fit")

mta_ts_df <- setNames(
  data.frame(time(mta_ts), as.numeric(mta_ts)),
  c("Date", "Ridership")
)

# Build the overlay plot one trace at a time.
fig <- plot_ly()
fig <- add_lines(fig, data = mta_ts_df, x = ~Date, y = ~Ridership,
                 name = 'Original Data', line = list(color = 'black'))
fig <- add_lines(fig, data = mean_mta_df, x = ~Date, y = ~Mean,
                 name = 'Mean Forecast', line = list(color = 'blue'))
fig <- add_lines(fig, data = naive_mta_df, x = ~Date, y = ~Naive,
                 name = 'Naïve Forecast', line = list(color = 'red'))
fig <- add_lines(fig, data = drift_mta_df, x = ~Date, y = ~Drift,
                 name = 'Drift Forecast', line = list(color = 'green'))
fig <- add_lines(fig, data = arima_mta_df, x = ~Date, y = ~ARIMA_Fit,
                 name = 'ARIMA Fit', line = list(color = 'purple'))
layout(fig,
       title = 'MTA Ridership Forecast',
       xaxis = list(title = 'Date'),
       yaxis = list(title = 'Number of Rides'),
       legend = list(title = list(text = 'Forecast Methods')))
Code
# Compare the ARIMA forecast with mean, naive, and drift benchmarks for the
# CTA (Chicago) ridership series.
mean_cta <- meanf(cta_ts, h = 365)
naive_cta <- naive(cta_ts, h = 365)
drift_cta <- rwf(cta_ts, drift = TRUE, h = 365)
arima_cta <- cta_pred

# Turn a forecast's point predictions into a two-column plotting frame.
as_fc_df <- function(fc, label) {
  setNames(data.frame(time(fc$mean), as.numeric(fc$mean)), c("Date", label))
}
mean_cta_df <- as_fc_df(mean_cta, "Mean")
naive_cta_df <- as_fc_df(naive_cta, "Naive")
drift_cta_df <- as_fc_df(drift_cta, "Drift")
arima_cta_df <- as_fc_df(arima_cta, "ARIMA_Fit")

cta_ts_df <- setNames(
  data.frame(time(cta_ts), as.numeric(cta_ts)),
  c("Date", "Ridership")
)

# Build the overlay plot one trace at a time.
fig <- plot_ly()
fig <- add_lines(fig, data = cta_ts_df, x = ~Date, y = ~Ridership,
                 name = 'Original Data', line = list(color = 'black'))
fig <- add_lines(fig, data = mean_cta_df, x = ~Date, y = ~Mean,
                 name = 'Mean Forecast', line = list(color = 'blue'))
fig <- add_lines(fig, data = naive_cta_df, x = ~Date, y = ~Naive,
                 name = 'Naïve Forecast', line = list(color = 'red'))
fig <- add_lines(fig, data = drift_cta_df, x = ~Date, y = ~Drift,
                 name = 'Drift Forecast', line = list(color = 'green'))
fig <- add_lines(fig, data = arima_cta_df, x = ~Date, y = ~ARIMA_Fit,
                 name = 'ARIMA Fit', line = list(color = 'purple'))
layout(fig,
       title = 'CTA Ridership Forecast',
       xaxis = list(title = 'Date'),
       yaxis = list(title = 'Number of Rides'),
       legend = list(title = list(text = 'Forecast Methods')))
Code
# Compare the SARIMA forecast with mean, naive, and drift benchmarks for the
# monthly vehicle-miles series.
mean_cars <- meanf(cars_ts, h = 36)
naive_cars <- naive(cars_ts, h = 36)
drift_cars <- rwf(cars_ts, drift = TRUE, h = 36)
arima_cars <- cars_pred

# Turn a forecast's point predictions into a two-column plotting frame.
as_fc_df <- function(fc, label) {
  setNames(data.frame(time(fc$mean), as.numeric(fc$mean)), c("Date", label))
}
mean_cars_df <- as_fc_df(mean_cars, "Mean")
naive_cars_df <- as_fc_df(naive_cars, "Naive")
drift_cars_df <- as_fc_df(drift_cars, "Drift")
arima_cars_df <- as_fc_df(arima_cars, "ARIMA_Fit")

cars_ts_df <- setNames(
  data.frame(time(cars_ts), as.numeric(cars_ts)),
  c("Date", "Miles")
)

# Build the overlay plot one trace at a time.
fig <- plot_ly()
fig <- add_lines(fig, data = cars_ts_df, x = ~Date, y = ~Miles,
                 name = 'Original Data', line = list(color = 'black'))
fig <- add_lines(fig, data = mean_cars_df, x = ~Date, y = ~Mean,
                 name = 'Mean Forecast', line = list(color = 'blue'))
fig <- add_lines(fig, data = naive_cars_df, x = ~Date, y = ~Naive,
                 name = 'Naïve Forecast', line = list(color = 'red'))
fig <- add_lines(fig, data = drift_cars_df, x = ~Date, y = ~Drift,
                 name = 'Drift Forecast', line = list(color = 'green'))
fig <- add_lines(fig, data = arima_cars_df, x = ~Date, y = ~ARIMA_Fit,
                 name = 'ARIMA Fit', line = list(color = 'purple'))
layout(fig,
       title = 'Vehicle Miles Forecast',
       xaxis = list(title = 'Date'),
       yaxis = list(title = 'Million Vehicle Miles'),
       legend = list(title = list(text = 'Forecast Methods')))
Code
# Compare the SARIMA forecast with mean, naive, and drift benchmarks for the
# monthly gas-price series.
mean_gas <- meanf(gas_ts, h = 36)
naive_gas <- naive(gas_ts, h = 36)
drift_gas <- rwf(gas_ts, drift = TRUE, h = 36)
arima_gas <- gas_pred

# Turn a forecast's point predictions into a two-column plotting frame.
as_fc_df <- function(fc, label) {
  setNames(data.frame(time(fc$mean), as.numeric(fc$mean)), c("Date", label))
}
mean_gas_df <- as_fc_df(mean_gas, "Mean")
naive_gas_df <- as_fc_df(naive_gas, "Naive")
drift_gas_df <- as_fc_df(drift_gas, "Drift")
arima_gas_df <- as_fc_df(arima_gas, "ARIMA_Fit")

gas_ts_df <- setNames(
  data.frame(time(gas_ts), as.numeric(gas_ts)),
  c("Date", "Prices")
)

# Build the overlay plot one trace at a time.
fig <- plot_ly()
fig <- add_lines(fig, data = gas_ts_df, x = ~Date, y = ~Prices,
                 name = 'Original Data', line = list(color = 'black'))
fig <- add_lines(fig, data = mean_gas_df, x = ~Date, y = ~Mean,
                 name = 'Mean Forecast', line = list(color = 'blue'))
fig <- add_lines(fig, data = naive_gas_df, x = ~Date, y = ~Naive,
                 name = 'Naïve Forecast', line = list(color = 'red'))
fig <- add_lines(fig, data = drift_gas_df, x = ~Date, y = ~Drift,
                 name = 'Drift Forecast', line = list(color = 'green'))
fig <- add_lines(fig, data = arima_gas_df, x = ~Date, y = ~ARIMA_Fit,
                 name = 'ARIMA Fit', line = list(color = 'purple'))
layout(fig,
       title = 'Gas Prices Forecast',
       xaxis = list(title = 'Date'),
       yaxis = list(title = 'Average Gas Price'),
       legend = list(title = list(text = 'Forecast Methods')))
Code
# Compare the ARIMA forecast with mean, naive, and drift benchmarks for the
# unemployment-rate series.
mean_unemp <- meanf(unemp_ts, h = 36)
naive_unemp <- naive(unemp_ts, h = 36)
drift_unemp <- rwf(unemp_ts, drift = TRUE, h = 36)
arima_unemp <- unemp_pred

# Turn a forecast's point predictions into a two-column plotting frame.
as_fc_df <- function(fc, label) {
  setNames(data.frame(time(fc$mean), as.numeric(fc$mean)), c("Date", label))
}
mean_unemp_df <- as_fc_df(mean_unemp, "Mean")
naive_unemp_df <- as_fc_df(naive_unemp, "Naive")
drift_unemp_df <- as_fc_df(drift_unemp, "Drift")
arima_unemp_df <- as_fc_df(arima_unemp, "ARIMA_Fit")

unemp_ts_df <- setNames(
  data.frame(time(unemp_ts), as.numeric(unemp_ts)),
  c("Date", "Rate")
)

# Build the overlay plot one trace at a time.
fig <- plot_ly()
fig <- add_lines(fig, data = unemp_ts_df, x = ~Date, y = ~Rate,
                 name = 'Original Data', line = list(color = 'black'))
fig <- add_lines(fig, data = mean_unemp_df, x = ~Date, y = ~Mean,
                 name = 'Mean Forecast', line = list(color = 'blue'))
fig <- add_lines(fig, data = naive_unemp_df, x = ~Date, y = ~Naive,
                 name = 'Naïve Forecast', line = list(color = 'red'))
fig <- add_lines(fig, data = drift_unemp_df, x = ~Date, y = ~Drift,
                 name = 'Drift Forecast', line = list(color = 'green'))
fig <- add_lines(fig, data = arima_unemp_df, x = ~Date, y = ~ARIMA_Fit,
                 name = 'ARIMA Fit', line = list(color = 'purple'))
layout(fig,
       title = 'Unemployment Rate Forecast',
       xaxis = list(title = 'Date'),
       yaxis = list(title = 'Unemployment Rate'),
       legend = list(title = list(text = 'Forecast Methods')))

Uber

Code
# Compare the ARIMA forecast with mean, naive, and drift benchmarks for
# Uber's stock price.
mean_uber <- meanf(uber_ts, h = 252)
naive_uber <- naive(uber_ts, h = 252)
drift_uber <- rwf(uber_ts, drift = TRUE, h = 252)
arima_uber <- uber_pred

# Turn a forecast's point predictions into a two-column plotting frame.
as_fc_df <- function(fc, label) {
  setNames(data.frame(time(fc$mean), as.numeric(fc$mean)), c("Date", label))
}
mean_uber_df <- as_fc_df(mean_uber, "Mean")
naive_uber_df <- as_fc_df(naive_uber, "Naive")
drift_uber_df <- as_fc_df(drift_uber, "Drift")
arima_uber_df <- as_fc_df(arima_uber, "ARIMA_Fit")

uber_ts_df <- setNames(
  data.frame(time(uber_ts), as.numeric(uber_ts)),
  c("Date", "Stock")
)

# Build the overlay plot one trace at a time.
fig <- plot_ly()
fig <- add_lines(fig, data = uber_ts_df, x = ~Date, y = ~Stock,
                 name = 'Original Data', line = list(color = 'black'))
fig <- add_lines(fig, data = mean_uber_df, x = ~Date, y = ~Mean,
                 name = 'Mean Forecast', line = list(color = 'blue'))
fig <- add_lines(fig, data = naive_uber_df, x = ~Date, y = ~Naive,
                 name = 'Naïve Forecast', line = list(color = 'red'))
fig <- add_lines(fig, data = drift_uber_df, x = ~Date, y = ~Drift,
                 name = 'Drift Forecast', line = list(color = 'green'))
fig <- add_lines(fig, data = arima_uber_df, x = ~Date, y = ~ARIMA_Fit,
                 name = 'ARIMA Fit', line = list(color = 'purple'))
layout(fig,
       title = 'Uber Stock Price Forecast',
       xaxis = list(title = 'Date'),
       yaxis = list(title = 'Stock Price'),
       legend = list(title = list(text = 'Forecast Methods')))

Zoom

Code
# Compare the ARIMA forecast with mean, naive, and drift benchmarks for
# Zoom's stock price.
mean_zoom <- meanf(zoom_ts, h = 252)
naive_zoom <- naive(zoom_ts, h = 252)
drift_zoom <- rwf(zoom_ts, drift = TRUE, h = 252)
arima_zoom <- zoom_pred

# Turn a forecast's point predictions into a two-column plotting frame.
as_fc_df <- function(fc, label) {
  setNames(data.frame(time(fc$mean), as.numeric(fc$mean)), c("Date", label))
}
mean_zoom_df <- as_fc_df(mean_zoom, "Mean")
naive_zoom_df <- as_fc_df(naive_zoom, "Naive")
drift_zoom_df <- as_fc_df(drift_zoom, "Drift")
arima_zoom_df <- as_fc_df(arima_zoom, "ARIMA_Fit")

zoom_ts_df <- setNames(
  data.frame(time(zoom_ts), as.numeric(zoom_ts)),
  c("Date", "Stock")
)

# Build the overlay plot one trace at a time.
fig <- plot_ly()
fig <- add_lines(fig, data = zoom_ts_df, x = ~Date, y = ~Stock,
                 name = 'Original Data', line = list(color = 'black'))
fig <- add_lines(fig, data = mean_zoom_df, x = ~Date, y = ~Mean,
                 name = 'Mean Forecast', line = list(color = 'blue'))
fig <- add_lines(fig, data = naive_zoom_df, x = ~Date, y = ~Naive,
                 name = 'Naïve Forecast', line = list(color = 'red'))
fig <- add_lines(fig, data = drift_zoom_df, x = ~Date, y = ~Drift,
                 name = 'Drift Forecast', line = list(color = 'green'))
fig <- add_lines(fig, data = arima_zoom_df, x = ~Date, y = ~ARIMA_Fit,
                 name = 'ARIMA Fit', line = list(color = 'purple'))
layout(fig,
       title = 'Zoom Stock Price Forecast',
       xaxis = list(title = 'Date'),
       yaxis = list(title = 'Stock Price'),
       legend = list(title = list(text = 'Forecast Methods')))

Exxon

Code
# Compare the ARIMA forecast with mean, naive, and drift benchmarks for
# Exxon's stock price.
mean_exxon <- meanf(exxon_ts, h = 252)
naive_exxon <- naive(exxon_ts, h = 252)
drift_exxon <- rwf(exxon_ts, drift = TRUE, h = 252)
arima_exxon <- exxon_pred

# Turn a forecast's point predictions into a two-column plotting frame.
as_fc_df <- function(fc, label) {
  setNames(data.frame(time(fc$mean), as.numeric(fc$mean)), c("Date", label))
}
mean_exxon_df <- as_fc_df(mean_exxon, "Mean")
naive_exxon_df <- as_fc_df(naive_exxon, "Naive")
drift_exxon_df <- as_fc_df(drift_exxon, "Drift")
arima_exxon_df <- as_fc_df(arima_exxon, "ARIMA_Fit")

exxon_ts_df <- setNames(
  data.frame(time(exxon_ts), as.numeric(exxon_ts)),
  c("Date", "Stock")
)

# Build the overlay plot one trace at a time.
fig <- plot_ly()
fig <- add_lines(fig, data = exxon_ts_df, x = ~Date, y = ~Stock,
                 name = 'Original Data', line = list(color = 'black'))
fig <- add_lines(fig, data = mean_exxon_df, x = ~Date, y = ~Mean,
                 name = 'Mean Forecast', line = list(color = 'blue'))
fig <- add_lines(fig, data = naive_exxon_df, x = ~Date, y = ~Naive,
                 name = 'Naïve Forecast', line = list(color = 'red'))
fig <- add_lines(fig, data = drift_exxon_df, x = ~Date, y = ~Drift,
                 name = 'Drift Forecast', line = list(color = 'green'))
fig <- add_lines(fig, data = arima_exxon_df, x = ~Date, y = ~ARIMA_Fit,
                 name = 'ARIMA Fit', line = list(color = 'purple'))
layout(fig,
       title = 'Exxon Stock Price Forecast',
       xaxis = list(title = 'Date'),
       yaxis = list(title = 'Stock Price'),
       legend = list(title = list(text = 'Forecast Methods')))

In all of these cases, the mean forecast is rather useless, as it begins from a value incongruent with the current time step. ARIMA fits tend to provide more information than benchmark methods when they have a seasonal component, but the reality is that we have not gotten particularly valuable insights from univariate time series analysis. Moving forward, it will be valuable to see how the presence of exogenous variables or more complex time series analysis methods contribute to improved forecasts.